# -*- coding: utf-8 -*-
"""
Created on Sun Apr 21 22:21:19 2024

@author: Michael Young
"""

import pandas as pd
pd.options.plotting.backend = "plotly"

import plotly.io as pio
import plotly.graph_objects as go

#send plots to current browser. Export from browser as .png
pio.renderers.default='browser'

import pingouin
from scipy import stats

# Load the per-leader summary workbook (leaders with 10K each of spontaneous
# and prepared) and collect the distinct author names it contains.
SP_leaders_DF = pd.read_excel(
    r"C:\Michael\Albany LAIO Lab and Research Projects\Spontaneous vs Prepared\Scores_by_Leader_Spontaneity.xlsx"
)
unique_leaders = SP_leaders_DF['Author'].unique()

# Column dtypes to enforce while reading the document-level scores.
dtype_dict = {
    'Author': 'str',
    'Filename': 'str',
    'Date': 'str',
    'Sentence-Number': 'int',
    'Topic': 'str',
    'HC': 'int',
    'LC': 'int',
}

# Document-level scores; filtered per author further down.
Document_Data = pd.read_excel(
    r"C:\Michael\Albany LAIO Lab and Research Projects\Spontaneous vs Prepared\TALID document scores.xlsx",
    dtype=dtype_dict,
)


# Dependent variables tested one at a time in the per-leader ANOVAs below.
# The order matters: later code reads the resulting p-values by position.
List_of_variables = [
    'BACE', 'CC', 'DIS', 'IGB', 'PWR', 'SC', 'TASK',
    'nACH_100', 'nAFF_100', 'nPWR_100',
    'I1', 'P1', 'Diff_100', 'Int_100',
]

#Leader_name=unique_leaders[1]
# NOTE(review): this replaces the author list computed from the spreadsheet
# with a single hard-coded leader, so everything downstream that is based on
# Leader_data covers only this one author. Looks like a debugging override --
# confirm whether it should stay.
unique_leaders = ['Abbas_Mahmoud']


# One record per leader: [leader name, p-value for each entry of
# List_of_variables in order]. Later code reads these p-values by position,
# so every record must contain exactly one entry per variable.
Leader_data = []

for Leader_name in unique_leaders:
    One_leader_DF = Document_Data[Document_Data['Author'] == Leader_name]

    # Leader name first, then one p-value per variable
    p_values = [Leader_name]

    for variable in List_of_variables:
        try:
            # DataFrame.anova is provided by the pingouin import above.
            anova = One_leader_DF.anova(dv=variable, between='Spontaneity', detailed=True)
            # Row 0 of the detailed table is read as the 'Spontaneity' effect
            # (presumably the between-factor row -- confirm against pingouin docs).
            p_value = anova.at[0, 'p-unc']
        except Exception as error:
            # The ANOVA call itself is the step most likely to fail (e.g. a
            # leader with only one Spontaneity level), so it lives inside the
            # try. Record NaN so the record keeps its length and the
            # positional reads stay aligned; NaN never compares <= 0.05, so
            # a failed test is simply not counted as significant.
            print("An exception occurred for ", Leader_name, variable, error)
            p_value = float('nan')
        p_values.append(p_value)

    Leader_data.append(p_values)


# Per-variable tallies: how many leaders have a p-value at or below 0.05.
# In each Leader_data record, position 0 is the leader name and positions
# 1-14 hold the p-values in List_of_variables order.
BACE_count = sum(1 for record in Leader_data if record[1] <= 0.05)
CC_count = sum(1 for record in Leader_data if record[2] <= 0.05)
DIS_count = sum(1 for record in Leader_data if record[3] <= 0.05)
IGB_count = sum(1 for record in Leader_data if record[4] <= 0.05)
PWR_count = sum(1 for record in Leader_data if record[5] <= 0.05)
SC_count = sum(1 for record in Leader_data if record[6] <= 0.05)
TASK_count = sum(1 for record in Leader_data if record[7] <= 0.05)
nACH_100_count = sum(1 for record in Leader_data if record[8] <= 0.05)
nAFF_100_count = sum(1 for record in Leader_data if record[9] <= 0.05)
nPWR_100_count = sum(1 for record in Leader_data if record[10] <= 0.05)
I1_count = sum(1 for record in Leader_data if record[11] <= 0.05)
P1_count = sum(1 for record in Leader_data if record[12] <= 0.05)
Diff_100_count = sum(1 for record in Leader_data if record[13] <= 0.05)
Int_100_count = sum(1 for record in Leader_data if record[14] <= 0.05)

# Per-leader tally: append to each record the number of its own 14 p-values
# that are at or below 0.05 (this becomes the final element of the record).
for leader in Leader_data:
    leader_count = sum(1 for p_value in leader[1:15] if p_value <= 0.05)
    leader.append(leader_count)

# Number of leaders analysed; also reused further down in the script.
denominator = len(Leader_data)

# % significant differences for all leaders by variable:
# prints the raw count and the count as a fraction of all leaders.
for label, count in (
    ("BACE", BACE_count),
    ("CC", CC_count),
    ("DIS", DIS_count),
    ("IGB", IGB_count),
    ("PWR", PWR_count),
    ("SC", SC_count),
    ("TASK", TASK_count),
    ("nACH_100", nACH_100_count),
    ("nAFF_100", nAFF_100_count),
    ("nPWR_100", nPWR_100_count),
    ("I1", I1_count),
    ("P1", P1_count),
    ("Diff_100", Diff_100_count),
    ("Int_100", Int_100_count),
):
    print(label + ": ", count, " ", round(count/denominator, 2))


# Create Leader_data_DF to do more sophisticated operations.
# Each Leader_data record is [Author, one p-value per variable, sig_count];
# the '<variable>_sig' columns therefore hold the p-values themselves.
# Column names are derived from List_of_variables so they stay in the same
# order as the p-values were produced.
Leader_data_DF = pd.DataFrame.from_records(
    Leader_data,
    columns=['Author']
    + [name + '_sig' for name in List_of_variables]
    + ['sig_count'],
)


print("Number of sig differences per leader")
print("Mean: ", Leader_data_DF['sig_count'].mean())
print("Min: ", Leader_data_DF['sig_count'].min())
print("Max: ", Leader_data_DF['sig_count'].max())


# With the plotly plotting backend, hist() only builds and returns a figure;
# it has to be shown explicitly or the histogram is silently discarded when
# the file is run as a script.
Leader_data_DF['sig_count'].hist().show()

 # Get means into a form for plotting and merging
# Split the per-leader summary into its Spontaneous and Prepared rows,
# each renumbered from 0.
Leader_S_Means_DF = SP_leaders_DF[SP_leaders_DF['Spontaneity']=='Spontaneous'].reset_index(drop=True)
Leader_P_Means_DF = SP_leaders_DF[SP_leaders_DF['Spontaneity']=='Prepared'].reset_index(drop=True)

# Put each author's two rows side by side in Leader_means_DF; columns shared
# by both halves get the suffix _S (Spontaneous) or _P (Prepared).
Leader_means_DF = pd.merge(
    Leader_S_Means_DF,
    Leader_P_Means_DF,
    on=['Author'],
    suffixes=('_S', '_P'),
)

# is the number of significant differences related to any of the mean scores?
# Correlate sig_count with every numeric column of the merged means.
Leader_sig_DF = Leader_data_DF[['Author','sig_count']].merge(Leader_means_DF, on=['Author'])
correlations = Leader_sig_DF.corr(numeric_only=True)['sig_count']



# Rebind List_of_variables to the hyphenated spellings used from here on
# (these are looked up in Leader_means_DF with _S / _P suffixes).
List_of_variables = [
    'BACE', 'CC', 'DIS', 'IGB', 'PWR', 'SC', 'TASK',
    'nACH-100', 'nAFF-100', 'nPWR-100',
    'I-1', 'P-1', 'DIFF-100', 'INT-100',
]

# create a new variable for differences in Spontaneous and Prepared means:
# <variable>_diff = Spontaneous mean minus Prepared mean, per leader
for variable in List_of_variables:
    spontaneous_mean = Leader_means_DF[variable+'_S']
    prepared_mean = Leader_means_DF[variable+'_P']
    Leader_means_DF[variable+'_diff'] = spontaneous_mean - prepared_mean
    

# Calculate directional consistency as largest % higher: for each variable,
# the percentage of compared leaders whose mean differs in the more common
# direction (Spontaneous higher vs Prepared higher).

print('Directional Consistency')

# The counts below are taken over the rows of Leader_means_DF, so the share
# must be computed against that same number of rows (not the number of
# leaders that went through the ANOVA step, which can differ).
compared_leaders = len(Leader_means_DF)

for variable in List_of_variables:
    differences = Leader_means_DF[variable+'_diff']

    # Positive difference => Spontaneous mean is higher; negative => Prepared.
    # Zero or missing differences count for neither direction.
    Spontaneous_count = int((differences > 0).sum())
    Prepared_count = int((differences < 0).sum())

    #print(Spontaneous_count,Prepared_count,compared_leaders)
    if compared_leaders:
        # Scale to 0-100 so the value matches the "%" printed after it
        Consistency = 100 * max(Spontaneous_count, Prepared_count) / compared_leaders
    else:
        # Nothing to compare; avoid dividing by zero
        Consistency = 0

    if Spontaneous_count == Prepared_count:
        # A tie has no "higher" direction, so do not attribute it to Prepared
        print(variable + " Spontaneous and Prepared are tied at " + str(round(Consistency, 1)) + "%")
        continue

    if Spontaneous_count > Prepared_count:
        speech = "Spontaneous"
    else:
        speech = "Prepared"
    print(variable +" "+speech+" is higher " + str(round(Consistency, 1)) + "%")
    

# Generate lollipop graphs for each variable

def min_value(row, variable_name=None):
    """Return the smaller of a row's Spontaneous and Prepared values.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (e.g. a DataFrame row) that holds
        ``<name>_S`` and ``<name>_P`` entries.
    variable_name : str, optional
        Base name of the variable to compare. When omitted, the module-level
        loop variable ``variable`` is used, which is how the function behaves
        when passed directly to ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The lesser of ``row[<name>_S]`` and ``row[<name>_P]``.
    """
    if variable_name is None:
        # Existing callers pass only the row and rely on the global loop variable
        variable_name = variable
    return min(row[variable_name+'_S'], row[variable_name+'_P'])


# One figure per variable: a thin bar spanning each leader's Spontaneous and
# Prepared means, with a marker at each end (a "lollipop" chart).
for variable in List_of_variables:

    # set base as lesser of _S and _P
    # (min_value reads the current loop variable to pick the columns)
    Base = Leader_means_DF.apply(min_value, axis=1)

    # set height as difference of _S and _P
    Height = abs(Leader_means_DF[variable+'_diff'])

    # Combine base, height and author in one frame so they can be sorted
    # together; leaders are ordered by the size of the difference.
    data = {'Base': Base,
            'Height': Height,
            'Author': Leader_means_DF['Author']
            }
    figure_DF = pd.concat(data, axis=1).sort_values(by=['Height'])

    #create go Figure
    fig = go.Figure()

    # Bar from the lower mean (base) up by the absolute difference (height)
    fig.add_trace(go.Bar(x=figure_DF['Author'],
                         y=figure_DF['Height'],
                         base=figure_DF['Base'],
                         width=[0.2]*len(figure_DF),
                         name='Difference',
                         ))

    # Marker at each leader's Prepared mean
    fig.add_trace(go.Scatter(x=Leader_means_DF['Author'],
                             y=Leader_means_DF[variable+'_P'],
                             mode='markers',
                             name='Prepared',
                             ))

    # Marker at each leader's Spontaneous mean
    fig.add_trace(go.Scatter(x=Leader_means_DF['Author'],
                             y=Leader_means_DF[variable+'_S'],
                             mode='markers',
                             name='Spontaneous',
                             ))

    # The title reports how many leaders are actually drawn in this figure,
    # which is the row count of the plotted frame.
    fig.update_layout(xaxis_type='category',
                      title= "Spontaneous and Prepared means for " + variable + " for " + str(len(figure_DF)) + " Leaders")
    #fig.update_yaxes(range=[0, 1])
    fig.show()
    # NOTE(review): engine='orca' selects plotly's legacy export engine --
    # confirm it is still installed/supported in the target environment.
    fig.write_image("C:/Michael/Albany LAIO Lab and Research Projects/Spontaneous vs Prepared/"+variable+" spontaneity by leader.png", format='png',scale=2, engine='orca',width=1600, height=1000) 


# Calculate the ratio between Spontaneous and Prepared word count
Leader_means_DF['S_P_ratio'] = Leader_means_DF['Word_Count_S'].div(Leader_means_DF['Word_Count_P'])

# Average (signed) Spontaneous-minus-Prepared difference across leaders
print("Mean difference of means S/P")
for variable in List_of_variables:
    diff_column = variable+'_diff'
    print(diff_column, Leader_means_DF[diff_column].mean())

# Largest difference in absolute value across leaders
print("Max difference of means S/P")
for variable in List_of_variables:
    diff_column = variable+'_diff'
    print(diff_column, max(Leader_means_DF[diff_column].abs()))


from scipy.stats import pearsonr

# Pearson correlation of each difference with the word-count ratio;
# prints the full result object returned by pearsonr
print("Correlation with S/P ratio")
for variable in List_of_variables:
    diff_column = variable+'_diff'
    print(diff_column, pearsonr(Leader_means_DF[diff_column], Leader_means_DF['S_P_ratio']))


# Leader_means_DF.drop(columns=['Author', 'Spontaneity_S','Spontaneity_P']).corr())

# Marker printed when the script has run to the end
print("Clean Exit?")
    